*****************************************************************************************************
* Purpose: Clean CHARLS mortality data (interview and death dates) for analysis
* Written by:	Yuan Zhang and Hunter Green
* Last updated: 2024-12-28
* Stata version: 18.0
*****************************************************************************************************

* User toggle: "T" when Yuan is running this do-file, any other value otherwise
* (read below when the folder macros are defined)
global Yuan "F"


*****************************************************************************************************
* Options, global macros
*****************************************************************************************************
* Session settings
version 18.0        // run the rest of this do-file under Stata 18.0 rules
clear all           // empty memory (data, labels, programs, stored results); global macros are kept
set varabbrev off   // variable names must be typed in full
set more off        // do not pause long output
pause on            // let -pause- commands interrupt execution

* Global folder macros
* Only the root of the raw data differs between users; every CHARLS folder is derived from it
if "${Yuan}" == "T" {   //Yuan add your folder path here
	* paper
	global paper_data "/Users/dcflood/Library/CloudStorage/Dropbox-UniversityofMichigan/David Flood/HPACC/Aging projects/HRS diabetes mortality/Data/Raw"
}
else {
	* paper
	global paper_data "/Users/dcflood/Library/CloudStorage/Dropbox-UniversityofMichigan/David Flood/HPACC/Aging projects/HRS diabetes mortality/Data/Raw"
}

* CHARLS: one folder per survey year under <paper_data>/CHARLS, plus the Harmonized file folder
foreach wave in 2011 2013 2014 2015 2018 2020 {
	global charls_`wave' "${paper_data}/CHARLS/`wave'"
}
global harmonized_charls "${paper_data}/CHARLS/Harmonized"


*****************************************************************************************************
* Clean data
*****************************************************************************************************
*** CHARLS 2011 (Wave 1)
* Start from the IDs in the CHARLS 2011 demographic module
use householdID ID using "${charls_2011}/demographic_background.dta", clear

* Add IDs that appear only in the health care or health status modules,
* included to have sample size match cohort profile and Harmonized CHARLS
foreach module in health_care_and_insurance health_status_and_functioning {
	merge 1:1 ID using "${charls_2011}/`module'.dta", keepusing(ID householdID) nogenerate
}

* CHARLS 2011 indicator
generate in2011 = 1
*n=17,708

* Attach CHARLS 2011 interview dates; only respondents found in both files are kept
merge 1:1 ID using "${charls_2011}/weight.dta", keepusing(iyear imonth) keep(match) nogenerate

* Year and month of CHARLS 2011 interview
rename (iyear imonth) (iw2011yr iw2011mo)
destring iw2011yr iw2011mo, replace

generate iw2011_ym = ym(iw2011yr, iw2011mo)
format iw2011_ym %tm

* Rebuild ID so it can be merged with later interviews:
* household ID, then "0", then the last two characters of the 2011 ID
replace ID = householdID + "0" + substr(ID, -2, 2)
drop householdID

* Keep the 2011 sample in a temporary file for the wave 2 merge
tempfile charls2011
save `charls2011'


*** CHARLS 2013 (Wave 2)
* Open CHARLS 2013 interview dates and the death flag; rows without an ID cannot be merged
use ID INDV_L_Died iyear imonth using "${charls_2013}/Weights.dta", clear
drop if mi(ID)

* Merge CHARLS 2013 exit interview (reported death year/month and calendar type)
merge 1:1 ID using "${charls_2013}/Exit_Interview.dta", keepusing(exb001_1 exb001_2 exb002) nogenerate

* Merge the 2011 temporary file; IDs found only in the 2013 files are dropped
merge 1:1 ID using `charls2011', keep(match using) nogenerate

* Year and month of CHARLS 2013 interview
rename (iyear imonth) (iw2013yr iw2013mo)
destring iw2013yr iw2013mo, replace

generate iw2013_ym = ym(iw2013yr, iw2013mo)
format iw2013_ym %tm

* Deceased indicator: 1 if flagged as died in the 2013 file, missing otherwise
generate deceased = 1 if INDV_L_Died == 1
drop INDV_L_Died

* Year and month of death, as reported in the 2013 exit interview
gen died_yr = exb001_1
gen died_mo = exb001_2
drop exb001_1 exb001_2

* Check missings in year and month of death
tab died_yr if deceased == 1, m
tab died_mo if deceased == 1, m
tab died_yr
tab died_mo if !mi(died_yr)

* Impute missing month of death for 5 respondents with nonmissing year of death
* 	All died in 2011/2012, next-of-kin were interviewed in 2013
* 	Impute month as midpoint between 2011 interview and end of year that mortality occurred
list ID iw2011yr iw2011mo iw2011_ym died_yr died_mo iw2013_ym if !mi(died_yr) & mi(died_mo)

* Yuan code - CHECKED
* 	The earlier expression, (12 - iw2011mo) / (2 + iw2011mo), had misplaced parentheses and
* 	returned a ratio rather than a month. The midpoint between the interview month and
* 	December is (12 - mo) / 2 + mo.
* 	This variable is diagnostic only (it is listed and then dropped). It ignores the year of
* 	death, so it can still differ from Hunter's value when death occurred in a later year
* 	than the 2011 interview.
gen _died_mo = round((12 - iw2011mo) / 2 + iw2011mo) if mi(died_mo) & !mi(died_yr)
* End of Yuan code

* Hunter code
* 	Midpoint, on the monthly date scale, between the 2011 interview and December of the death year
gen end_dyear_ym = ym(died_yr, 12)
format end_dyear_ym %tm

gen _died_ym = (((end_dyear_ym - iw2011_ym) / 2) + iw2011_ym) if mi(died_mo) & !mi(died_yr)
format _died_ym %tm

* Only the calendar month of the midpoint is kept; it is recombined with the reported year below
gen _died_mo2 = month(dofm(_died_ym))
* End of Hunter code

list ID iw2011yr iw2011mo iw2011_ym died_yr died_mo iw2013_ym _died_mo end_dyear_ym _died_ym _died_mo2 if !mi(died_yr) & mi(died_mo)

* NOTE: the listing below was pasted before the _died_mo formula was corrected, so its
* 	_died_mo column (3, 0, 1, 0, 1) shows the old values. The other columns are not affected.
/* This is to confirm we see the same thing:
       +--------------------------------------------------------------------------------------------------------------------------+
       |           ID   iw2011yr   iw2011mo   iw2011~m   died_yr   died_mo   iw2013~m   _died_mo   end_dy~m   _died_ym   _died_~2 |
       |--------------------------------------------------------------------------------------------------------------------------|
 2771. | 057633235001       2012          2     2012m2      2012         .     2013m8          3    2012m12     2012m7          7 |
 4675. | 101791301002       2011         10    2011m10      2012         .     2013m8          0    2012m12     2012m5          5 |
 5500. | 110131325001       2011          7     2011m7      2011         .     2013m7          1    2011m12     2011m9          9 |
 9084. | 200104319001       2011          8     2011m8      2011         .     2013m8          0    2011m12    2011m10         10 |
10641. | 261676213001       2011          7     2011m7      2012         .     2013m7          1    2012m12     2012m3          3 |
       +--------------------------------------------------------------------------------------------------------------------------+
*/

* Use Hunter's imputed month where the month of death was missing
replace died_mo = _died_mo2 if mi(died_mo) & !mi(died_yr)
drop _died_mo end_dyear_ym _died_ym _died_mo2

gen died_ym = ym(died_yr, died_mo)
format died_ym %tm

* Check reported ym of mortality: solar or lunar calendar
tab exb002
drop exb002

* Check: mortality should happen after the 2011 interview
count if (died_ym < iw2011_ym) & !mi(died_ym) & !mi(iw2011_ym)
* n=4: assign missing - treat these 4 R's ym of mortality as unknown
* 	Flag the cases once so the result does not depend on died_ym being blanked last.
* 	Both !mi() checks are needed because Stata treats missing as larger than any number.
gen byte _before_iw = (died_ym < iw2011_ym) & !mi(died_ym) & !mi(iw2011_ym)
foreach v of varlist died_yr died_mo died_ym {
	replace `v' = . if _before_iw
}
drop _before_iw

* Mortality occurred at the same year & month of the 2011 interview
count if (died_ym == iw2011_ym) & !mi(died_ym) & !mi(iw2011_ym)
* n=11: assign 0.5 months of survival after the 2011 interview in analysis 

* For those whose ym of mortality is unknown: assign the midpoint of the 2011 and 2013 interview dates
* 	"Unknown" means died_ym is missing. This also covers a deceased R with only one of
* 	year/month reported, or with a year/month pair that ym() cannot convert; such R's
* 	were previously left with no death date at all.
gen _died_ym = (((iw2013_ym - iw2011_ym) / 2) + iw2011_ym) if deceased == 1 & mi(died_ym)
format _died_ym %tm

replace died_ym = _died_ym if mi(died_ym) & !mi(_died_ym)
drop died_yr died_mo _died_ym


*** CHARLS 2014 (Life History Survey)
* Merge CHARLS 2014 sample information; IDs found only in the 2014 file are not added
merge 1:1 ID using "${charls_2014}/Sample_Infor.dta", keepusing(died iyear imonth) keep(master match) nogenerate

* Merge CHARLS 2014 demographic data, needed for interview indicator
merge 1:1 ID using "${charls_2014}/Demographic_Backgrounds.dta", keepusing(ID) keep(master match)

* CHARLS 2014 indicator: 1 if R has a record in the 2014 demographic file, 0 otherwise
gen in2014 = (_merge == 3)
drop _merge

* Check deceased and died variables
tab deceased died, m
* add 252 deceased Rs, 0 were recorded as deceased in earlier waves 

* Year and month of life history interview
rename (iyear imonth) (iw2014yr iw2014mo)
destring iw2014yr iw2014mo, replace

gen iw2014_ym = ym(iw2014yr, iw2014mo)
format iw2014_ym %tm

* Last seen in CHARLS
tab died, m
tab iw2013_ym if died == 1, m
* all 252 deceased respondents were seen in 2013

* Assign date of mortality as midpoint between 2013 and 2014 interview dates
* 	Only for R's not already recorded as deceased (same rule as the 2015 section), so a
* 	death date established in 2013 is never replaced by an imputed midpoint
gen _died_ym = (((iw2014_ym - iw2013_ym) / 2) + iw2013_ym) if died == 1 & deceased != 1
format _died_ym %tm

replace died_ym = _died_ym if died == 1 & deceased != 1
replace deceased = 1 if died == 1
drop died _died_ym


*** CHARLS 2015 (Wave 3)
* Merge CHARLS 2015 interview dates; IDs found only in the 2015 file are not added
merge 1:1 ID using "${charls_2015}/Sample_Infor.dta", keepusing(died iyear imonth) keep(master match) nogenerate

* Check deceased and died variables
tab deceased died, m
* add 337 deceased Rs, 226 were recorded as deceased in earlier waves

* Year and month of wave 3 interview
rename (iyear imonth) (iw2015yr iw2015mo)
destring iw2015yr iw2015mo, replace

gen iw2015_ym = ym(iw2015yr, iw2015mo)
format iw2015_ym %tm

* Last seen in CHARLS
tab died if deceased != 1
tab iw2014_ym if deceased != 1 & died == 1
tab iw2013_ym if deceased != 1 & died == 1 & mi(iw2014_ym)
tab iw2011_ym if deceased != 1 & died == 1 & mi(iw2014_ym) & mi(iw2013_ym)
* 69 were last seen in 2011/12, 28 in 2013, and 240 in 2014

* Most recent earlier interview date on record: 2014, else 2013, else 2011
gen _last_ym = iw2014_ym
replace _last_ym = iw2013_ym if mi(_last_ym)
replace _last_ym = iw2011_ym if mi(_last_ym)

* Assign date of mortality at midpoint between 2015 interview date and last interview date,
* for R's newly reported as died (not already recorded as deceased)
gen _died_ym = (((iw2015_ym - _last_ym) / 2) + _last_ym) if deceased != 1 & died == 1
format _died_ym %tm

replace died_ym = _died_ym if deceased != 1 & died == 1
replace deceased = 1 if deceased != 1 & died == 1
drop died _died_ym _last_ym


*** CHARLS 2018 (Wave 4)
* Merge CHARLS 2018 interview dates; IDs found only in the 2018 file are not added
merge 1:1 ID using "${charls_2018}/Sample_Infor.dta", keepusing(died iyear imonth) keep(master match) nogenerate

* Check deceased and died variables
tab deceased died, m
* add 829 deceased Rs, 0 were recorded as deceased in earlier waves

* Year and month of wave 4 interview
rename (iyear imonth) (iw2018yr iw2018mo)
destring iw2018yr iw2018mo, replace

gen iw2018_ym = ym(iw2018yr, iw2018mo)
format iw2018_ym %tm

* last seen in CHARLS
tab died, m
tab iw2015_ym if died == 1
tab iw2014_ym if died == 1 & mi(iw2015_ym) & !mi(iw2014_ym)
tab iw2013_ym if died == 1 & mi(iw2015_ym) & mi(iw2014_ym) & !mi(iw2013_ym)
tab iw2011_ym if died == 1 & mi(iw2015_ym) & mi(iw2014_ym) & mi(iw2013_ym) & !mi(iw2011_ym)
* 20 were last seen in 2011/12, 18 in 2013, 52 in 2014, 739 in 2015/16

* Most recent earlier interview date on record: 2015, else 2014, else 2013, else 2011
gen _last_ym = iw2015_ym
foreach w in 2014 2013 2011 {
	replace _last_ym = iw`w'_ym if mi(_last_ym)
}

* Assign date of mortality at midpoint between 2018 interview date and last interview date
* 	Only for R's not already recorded as deceased (same rule as the 2015 section), so a
* 	death date established in an earlier wave is never replaced by a later midpoint
gen _died_ym = (((iw2018_ym - _last_ym) / 2) + _last_ym) if died == 1 & deceased != 1
format _died_ym %tm

replace died_ym = _died_ym if died == 1 & deceased != 1
replace deceased = 1 if died == 1
drop died _died_ym _last_ym


*** CHARLS 2020 (Wave 5)
* Merge CHARLS 2020 interview dates; IDs found only in the 2020 file are not added
merge 1:1 ID using "${charls_2020}/Sample_Infor.dta", keepusing(died iyear imonth) keep(master match) nogenerate

* Merge CHARLS 2020 exit interview
merge 1:1 ID using "${charls_2020}/Exit_Module.dta", keepusing(exb001_1 exb001_2 exb001_3 exb002) keep(master match) nogenerate

* Merge CHARLS 2020 demographic data, needed for interview indicator
merge 1:1 ID using "${charls_2020}/Demographic_Background.dta", keepusing(ID) keep(master match) generate(_merge_demog)

* Merge CHARLS 2020 health data, needed for interview indicator
merge 1:1 ID using "${charls_2020}/Health_Status_and_Functioning.dta", keepusing(ID) keep(master match) generate(_merge_health)

* CHARLS 2020 indicator: 1 if R has a record in either the demographic or the health file
gen in2020 = (_merge_demog == 3 | _merge_health == 3)
drop _merge_demog _merge_health

* Check deceased and died variables
tab deceased died, m
* add 640 deceased Rs, 0 were recorded as deceased in earlier waves

* Year and month of wave 5 interview
rename (iyear imonth) (iw2020yr iw2020mo)
destring iw2020yr iw2020mo, replace

gen iw2020_ym = ym(iw2020yr, iw2020mo)
format iw2020_ym %tm

* Year and month of death, as reported in the 2020 exit module
gen died_yr = exb001_1
gen died_mo = exb001_2
drop exb001_1 exb001_2 exb001_3

* Check missings in year and month of death
tab died_yr if died == 1, m
tab died_mo if died == 1, m
tab died_yr
tab died_mo if !mi(died_yr)

* Use the reported death date where one exists
* 	A missing reported date must not blank a death date established in an earlier wave,
* 	so died_ym is only overwritten when the reported year/month converts to a valid date
gen _reported_ym = ym(died_yr, died_mo)
replace died_ym = _reported_ym if died == 1 & !mi(_reported_ym)
drop _reported_ym

* Check reported ym of mortality: solar or lunar calendar
tab exb002 if died == 1
drop exb002

* Check: mortality should happen after the 2011 interview
count if (died_ym < iw2011_ym) & !mi(died_ym) & !mi(iw2011_ym)
* n=0

* last seen in CHARLS among deceased respondents with missing death ym
tab died if mi(died_ym), m
tab iw2018_ym if died == 1 & mi(died_ym)
tab iw2015_ym if died == 1 & mi(iw2018_ym) & mi(died_ym)
tab iw2014_ym if died == 1 & mi(iw2018_ym) & mi(iw2015_ym) & !mi(iw2014_ym) & mi(died_ym)
tab iw2013_ym if died == 1 & mi(iw2018_ym) & mi(iw2015_ym) & mi(iw2014_ym) & !mi(iw2013_ym) & mi(died_ym)
tab iw2011_ym if died == 1 & mi(iw2018_ym) & mi(iw2015_ym) & mi(iw2014_ym) & mi(iw2013_ym) & !mi(iw2011_ym) & mi(died_ym)
* 9 were last seen in 2011/12, 6 in 2013, 12 in 2014, 67 in 2015/16, 462 in 2018

* Most recent earlier interview date on record: 2018, else 2015, else 2014, else 2013, else 2011
gen _last_ym = iw2018_ym
foreach w in 2015 2014 2013 2011 {
	replace _last_ym = iw`w'_ym if mi(_last_ym)
}

* Assign date of mortality at midpoint between 2020 interview date and last interview date,
* only where the death date is still missing
gen _died_ym = (((iw2020_ym - _last_ym) / 2) + _last_ym) if died == 1 & mi(died_ym)
format _died_ym %tm

replace died_ym = _died_ym if died == 1 & mi(died_ym)
replace deceased = 1 if died == 1
drop died died_yr died_mo _died_ym _last_ym


*** Drop variables
* The separate interview year and month variables are no longer needed; the %tm dates are kept
foreach w in 2011 2013 2014 2015 2018 2020 {
	drop iw`w'yr iw`w'mo
}


*** Year and month of biomarker collection (same as 2011 interview)
generate biomarker_ym = iw2011_ym
format biomarker_ym %tm


*** Death
* "True" death status and time: everything recorded through CHARLS 2020
gen true_deceased = deceased
gen true_death_ym = died_ym
format true_death_ym %tm

* "Study" death status and time: truncated at the end of study (December 2019)
rename deceased study_deceased
rename died_ym study_death_ym

* In study if died before 2020, otherwise censor
* 	Use >= 2020 rather than == 2020 so a death dated 2021 or later is also treated as
* 	after the end of study. The !mi() guard is required because Stata treats missing as
* 	larger than any number.
gen deathyr = year(dofm(study_death_ym))
gen byte _after_study = (deathyr >= 2020) & !mi(deathyr)

replace study_deceased = . if _after_study
replace study_death_ym = . if _after_study


*** Censor
* Merge Harmonized CHARLS; only respondents found in both files are kept
merge 1:1 ID using "${harmonized_charls}/H_CHARLS_D_Data.dta", keepusing(inw2 inw3 inw4) keep(match) nogenerate

* Interview indicators
gen in2013 = inw2
gen in2015 = inw3
gen in2018 = inw4
drop inw2 inw3 inw4

* Set interview ym to missing if respondent not interviewed
foreach w in 2013 2014 2015 2018 2020 {
	replace iw`w'_ym = . if in`w' == 0
}

* Indicator for end of study (December 2019)
gen end_study_ym = ym(2019, 12)
format end_study_ym %tm

* Year and month of censor
* 	1. Died after the end of study: censor at the end of study
* 	2. Interviewed in 2020 with no study death: censor at the end of study
* 	3. Otherwise, no study death: censor at the most recent interview (2018, 2015, 2014, 2013)
* 	R's with a study death, and R's with no death and no interview after 2011, keep a
* 	missing censor_ym
gen censor_ym = end_study_ym if _after_study
replace censor_ym = end_study_ym if !mi(iw2020_ym) & mi(study_death_ym) & mi(censor_ym)
* mi(censor_ym) makes each step apply only when no later wave has already set the censor date
foreach w in 2018 2015 2014 2013 {
	replace censor_ym = iw`w'_ym if !mi(iw`w'_ym) & mi(study_death_ym) & mi(censor_ym)
}
format censor_ym %tm
drop deathyr end_study_ym _after_study


*** Order variables
* Build the interview-indicator and interview-date lists wave by wave
local in_vars
local iw_vars
foreach w in 2011 2013 2014 2015 2018 2020 {
	local in_vars `in_vars' in`w'
	local iw_vars `iw_vars' iw`w'_ym
}
order ID `in_vars' `iw_vars' ///
      biomarker_ym true_deceased true_death_ym study_deceased study_death_ym censor_ym


*** Label variables
* Per-wave sample indicators and interview dates
foreach w in 2011 2013 2014 2015 2018 2020 {
	label variable in`w' "CHARLS in `w' sample"
	label variable iw`w'_ym "CHARLS `w' interview year & month"
}

* Biomarker, death, and censor variables
label variable biomarker_ym "CHARLS year & month of biomarker collection"
label variable true_deceased "R was deceased by CHARLS 2020 (True)"
label variable true_death_ym "CHARLS year & month of mortality (True)"
label variable study_deceased "R was deceased by December 2019 (Study)"
label variable study_death_ym "CHARLS year & month of mortality to December 2019 (Study)"
label variable censor_ym "CHARLS year & month of censor"


*** Save data
save "${paper_data}/charls_dates.dta", replace

